Make a prediction about coal production



In [3]:

    
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
sns.set()



In [5]:

    
df = pd.read_csv("../data/cleaned_coalpublic2013.csv", index_col="MSHA ID")

df.head()









    Out[5]:







  
    
      
      Year
      Mine_Name
      Mine_State
      Mine_County
      Mine_Status
      Mine_Type
      Company_Type
      Operation_Type
      Operating_Company
      Operating_Company_Address
      Union_Code
      Coal_Supply_Region
      Production_(short_tons)
      Average_Employees
      Labor_Hours
      log_production
    
    
      MSHA ID
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      103381
      2013
      Tacoa Highwall Miner
      Alabama
      Bibb
      Active, men working, not producing
      Surface
      Independent Producer Operator
      Mine only
      Jesse Creek Mining, Llc
      1615 Kent Dairy Rd, Alabaster, AL 35007
      
      Appalachia Southern
      56004
      10
      22392
      10.933178
    
    
      103404
      2013
      Reid School Mine
      Alabama
      Blount
      Permanently abandoned
      Surface
      Independent Producer Operator
      Mine only
      Taft Coal Sales & Associates,
      3000 Riverchase Galleria Ste 1, Birmingham, AL...
      UNIT
      Appalachia Southern
      28807
      18
      28447
      10.268374
    
    
      100759
      2013
      North River #1 Underground Min
      Alabama
      Fayette
      Active, men working, not producing
      Underground
      Independent Producer Operator
      Mine and Preparation Plant
      Jim Walter Resources Inc
      3114 County Rd 63 S, Berry, AL 35546
      UNIT
      Appalachia Southern
      1440115
      183
      474784
      14.180234
    
    
      103246
      2013
      Bear Creek
      Alabama
      Franklin
      Active
      Surface
      Independent Producer Operator
      Mine only
      Birmingham Coal & Coke Co., In
      912 Edenton Street, Birmingham, AL 35242
      
      Appalachia Southern
      87587
      13
      29193
      11.380388
    
    
      103451
      2013
      Knight Mine
      Alabama
      Franklin
      Active
      Surface
      Independent Producer Operator
      Mine only
      Birmingham Coal & Coke Co., In
      P.O. Box 354, Lynn, AL 35242
      
      Appalachia Southern
      147499
      27
      46393
      11.901577



In [8]:

    
len(df)









    Out[8]:





1061



In [32]:

    
for column in df.columns:
    print column









    



Year
Mine_Name
Mine_State
Mine_County
Mine_Status
Mine_Type
Company_Type
Operation_Type
Operating_Company
Operating_Company_Address
Union_Code
Coal_Supply_Region
Production_(short_tons)
Average_Employees
Labor_Hours



In [7]:

    
# log normal distribution
df['log_production'].hist()









    Out[7]:





<matplotlib.axes._subplots.AxesSubplot at 0xcb3d128>



In [9]:

    
df.Mine_Status.unique()









    Out[9]:





array(['Active, men working, not producing', 'Permanently abandoned',
       'Active', 'Temporarily closed', 'New, under construction'], dtype=object)



In [11]:

    
df[['Mine_Status', 'log_production']].groupby('Mine_Status').mean()









    Out[11]:







  
    
      
      log_production
    
    
      Mine_Status
      
    
  
  
    
      Active
      11.977453
    
    
      Active, men working, not producing
      10.499962
    
    
      New, under construction
      3.951244
    
    
      Permanently abandoned
      9.896046
    
    
      Temporarily closed
      9.162933

predict the production of coal mines



In [13]:

    
for column in df.columns:
    print column









    



Year
Mine_Name
Mine_State
Mine_County
Mine_Status
Mine_Type
Company_Type
Operation_Type
Operating_Company
Operating_Company_Address
Union_Code
Coal_Supply_Region
Production_(short_tons)
Average_Employees
Labor_Hours
log_production



In [14]:

    
df.Union_Code.unique()









    Out[14]:





array([' ', 'UNIT', 'United Mine Workers of America', 'INTE',
       'International Union of Operation Engineers',
       'Scotia Employees Association', 'Western Energy Workers'], dtype=object)



In [15]:

    
df.Year.unique()









    Out[15]:





array([2013], dtype=int64)



In [17]:

    
features = ['Average_Employees',
            'Labor_Hours']

categorials = ['Mine_State',
               'Mine_County',
               'Mine_Status',
               'Mine_Type',
               'Company_Type',
               'Operation_Type',
               'Operating_Company',
               'Operating_Company_Address',
               'Union_Code',
               'Coal_Supply_Region']

target = 'log_production'



In [18]:

    
fig = plt.subplots(figsize=(14,8))
sns.set_context('poster')
sns.violinplot(y='Mine_Status', x="log_production", data=df, split=True, inner="stick");
plt.tight_layout()



In [19]:

    
fig = plt.subplots(figsize=(14,8))
sns.set_context('poster')
sns.violinplot(y='Company_Type', x="log_production", data=df, split=True, inner="stick");
plt.tight_layout()



In [20]:

    
df.Company_Type.unique()









    Out[20]:





array(['Independent Producer Operator', 'Operating Subsidiary',
       'Contractor'], dtype=object)



In [24]:

    
pd.get_dummies(df.Company_Type).sample(50).head()









    Out[24]:







  
    
      
      Contractor
      Independent Producer Operator
      Operating Subsidiary
    
    
      MSHA ID
      
      
      
    
  
  
    
      4407220
      0
      0
      1
    
    
      4103164
      0
      0
      1
    
    
      1518691
      0
      1
      0
    
    
      3609127
      0
      0
      1
    
    
      3608613
      0
      1
      0



In [27]:

    
dummy_categoricals = []
for categorical in categorials:
    print categorical, len(df[categorical].unique())
    # avoid the dummy variable trap!
    drop_var = sorted(df[categorical].unique())[-1]
    temp_df = pd.get_dummies(df[categorical], prefix=categorical)
    df = pd.concat([df, temp_df], axis=1)
    temp_df.drop('_'.join([categorical, str(drop_var)]), axis=1, inplace=True)
    dummy_categoricals += temp_df.columns.tolist()









    



Mine_State 29
Mine_County 164
Mine_Status 5
Mine_Type 3
Company_Type 3
Operation_Type 2
Operating_Company 752
Operating_Company_Address 692
Union_Code 7
Coal_Supply_Region 8



In [28]:

    
dummy_categoricals[:10]









    Out[28]:





['Mine_State_Alabama',
 'Mine_State_Alaska',
 'Mine_State_Arizona',
 'Mine_State_Arkansas',
 'Mine_State_Colorado',
 'Mine_State_Illinois',
 'Mine_State_Indiana',
 'Mine_State_Kansas',
 'Mine_State_Kentucky (East)',
 'Mine_State_Kentucky (West)']

build our model



In [29]:

    
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor









    



C:\Anaconda2\envs\py2\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)



In [30]:

    
len(dummy_categoricals)









    Out[30]:





1655



In [32]:

    
train, test = train_test_split(df, test_size=0.3)



In [33]:

    
train.head()









    Out[33]:







  
    
      
      Year
      Mine_Name
      Mine_State
      Mine_County
      Mine_Status
      Mine_Type
      Company_Type
      Operation_Type
      Operating_Company
      Operating_Company_Address
      ...
      Union_Code_United Mine Workers of America
      Union_Code_Western Energy Workers
      Coal_Supply_Region_Appalachia Central
      Coal_Supply_Region_Appalachia Northern
      Coal_Supply_Region_Appalachia Southern
      Coal_Supply_Region_Illinois Basin
      Coal_Supply_Region_Interior
      Coal_Supply_Region_Powder River Basin
      Coal_Supply_Region_Uinta Region
      Coal_Supply_Region_Western
    
    
      MSHA ID
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      3604186
      2013
      A W Long Coal Company
      Pennsylvania (Bituminous)
      Clearfield
      Active
      Surface
      Independent Producer Operator
      Mine only
      A W Long Coal Company
      154 Long Lane, Munson, PA 16860
      ...
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
    
    
      1202316
      2013
      Freelandville Underground
      Indiana
      Knox
      Permanently abandoned
      Underground
      Independent Producer Operator
      Mine only
      Triad Underground Mining, L.L.
      14972 East S.R. 58, Edwardsport, IN 47528
      ...
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
    
    
      1519196
      2013
      #1
      Kentucky (East)
      Pike
      Active, men working, not producing
      Underground
      Independent Producer Operator
      Mine only
      Freedom Coal, Inc.
      P.O. Box 249, Stanville, KY 41659
      ...
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
    
    
      1202441
      2013
      Wild Boar Mine
      Indiana
      Warrick
      Active
      Surface
      Operating Subsidiary
      Mine only
      Peabody Wild Boar Mining, Llc
      566 Dickeyville Rd., Lynnville, IN 47619
      ...
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
    
    
      1518914
      2013
      Highwall Miner #1
      Kentucky (East)
      Pike
      Active
      Surface
      Contractor
      Mine only
      Jrc Mining, Inc
      P.O. Box 952, Hyden, KY 41749
      ...
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
    
  

5 rows × 1710 columns



In [35]:

    
rf = RandomForestRegressor(n_estimators=100, oob_score=True)



In [36]:

    
rf.fit(train[features + dummy_categoricals], train[target])









    Out[36]:





RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=True, random_state=None,
           verbose=0, warm_start=False)



In [38]:

    
fig = plt.subplots(figsize=(8,8))
sns.regplot(test[target], rf.predict(test[features + dummy_categoricals]))
plt.ylabel('Predicted Production')
plt.xlim(0,22)
plt.ylim(0,22)
plt.tight_layout()



In [39]:

    
from sklearn.metrics import explained_variance_score, r2_score, mean_squared_error



In [41]:

    
predicted = rf.predict(test[features + dummy_categoricals])
r2_score(test[target], predicted)









    Out[41]:





0.86404149543573749



In [42]:

    
explained_variance_score(test[target], predicted)









    Out[42]:





0.86510393552390252



In [43]:

    
mean_squared_error(test[target], predicted)









    Out[43]:





0.73959524396895193



In [44]:

    
rf_importances = pd.DataFrame({'name':train[features + dummy_categoricals].columns, 
                               'importance':rf.feature_importances_
                             }).sort_values(by='importance', ascending=False).reset_index(drop=True)
rf_importances.head(20)









    Out[44]:







  
    
      
      importance
      name
    
  
  
    
      0
      0.835921
      Labor_Hours
    
    
      1
      0.056142
      Average_Employees
    
    
      2
      0.006887
      Mine_Type_Surface
    
    
      3
      0.005849
      Coal_Supply_Region_Powder River Basin
    
    
      4
      0.003486
      Coal_Supply_Region_Illinois Basin
    
    
      5
      0.002567
      Coal_Supply_Region_Appalachia Central
    
    
      6
      0.002179
      Operating_Company_Address_P.O. Box 5, Van, WV ...
    
    
      7
      0.002119
      Mine_Status_Active
    
    
      8
      0.001933
      Operating_Company_West Fork Coal, Llc
    
    
      9
      0.001843
      Company_Type_Independent Producer Operator
    
    
      10
      0.001623
      Mine_County_Harlan
    
    
      11
      0.001494
      Operating_Company_Address_100 Cranberry Creek ...
    
    
      12
      0.001356
      Mine_Status_Active, men working, not producing
    
    
      13
      0.001305
      Operating_Company_Cambria Fuel Management Compan
    
    
      14
      0.001208
      Mine_State_West Virginia (Southern)
    
    
      15
      0.001202
      Operating_Company_Address_P.O. Box 190, Lovely...
    
    
      16
      0.001190
      Operating_Company_Mayo Resources Inc
    
    
      17
      0.001139
      Mine_State_West Virginia (Southern)
    
    
      18
      0.001094
      Mine_County_Martin
    
    
      19
      0.001067
      Mine_County_Campbell

	Year	Mine_Name	Mine_State	Mine_County	Mine_Status	Mine_Type	Company_Type	Operation_Type	Operating_Company	Operating_Company_Address	Union_Code	Coal_Supply_Region	Production_(short_tons)	Average_Employees	Labor_Hours	log_production
MSHA ID
103381	2013	Tacoa Highwall Miner	Alabama	Bibb	Active, men working, not producing	Surface	Independent Producer Operator	Mine only	Jesse Creek Mining, Llc	1615 Kent Dairy Rd, Alabaster, AL 35007		Appalachia Southern	56004	10	22392	10.933178
103404	2013	Reid School Mine	Alabama	Blount	Permanently abandoned	Surface	Independent Producer Operator	Mine only	Taft Coal Sales & Associates,	3000 Riverchase Galleria Ste 1, Birmingham, AL...	UNIT	Appalachia Southern	28807	18	28447	10.268374
100759	2013	North River #1 Underground Min	Alabama	Fayette	Active, men working, not producing	Underground	Independent Producer Operator	Mine and Preparation Plant	Jim Walter Resources Inc	3114 County Rd 63 S, Berry, AL 35546	UNIT	Appalachia Southern	1440115	183	474784	14.180234
103246	2013	Bear Creek	Alabama	Franklin	Active	Surface	Independent Producer Operator	Mine only	Birmingham Coal & Coke Co., In	912 Edenton Street, Birmingham, AL 35242		Appalachia Southern	87587	13	29193	11.380388
103451	2013	Knight Mine	Alabama	Franklin	Active	Surface	Independent Producer Operator	Mine only	Birmingham Coal & Coke Co., In	P.O. Box 354, Lynn, AL 35242		Appalachia Southern	147499	27	46393	11.901577

	log_production
Mine_Status
Active	11.977453
Active, men working, not producing	10.499962
New, under construction	3.951244
Permanently abandoned	9.896046
Temporarily closed	9.162933

	Year	Mine_Name	Mine_State	Mine_County	Mine_Status	Mine_Type	Company_Type	Operation_Type	Operating_Company	Operating_Company_Address	...	Union_Code_United Mine Workers of America	Union_Code_Western Energy Workers	Coal_Supply_Region_Appalachia Central	Coal_Supply_Region_Appalachia Northern	Coal_Supply_Region_Appalachia Southern	Coal_Supply_Region_Illinois Basin	Coal_Supply_Region_Interior	Coal_Supply_Region_Powder River Basin	Coal_Supply_Region_Uinta Region	Coal_Supply_Region_Western
MSHA ID
3604186	2013	A W Long Coal Company	Pennsylvania (Bituminous)	Clearfield	Active	Surface	Independent Producer Operator	Mine only	A W Long Coal Company	154 Long Lane, Munson, PA 16860	...	0	0	0	1	0	0	0	0	0	0
1202316	2013	Freelandville Underground	Indiana	Knox	Permanently abandoned	Underground	Independent Producer Operator	Mine only	Triad Underground Mining, L.L.	14972 East S.R. 58, Edwardsport, IN 47528	...	0	0	0	0	0	1	0	0	0	0
1519196	2013	#1	Kentucky (East)	Pike	Active, men working, not producing	Underground	Independent Producer Operator	Mine only	Freedom Coal, Inc.	P.O. Box 249, Stanville, KY 41659	...	0	0	1	0	0	0	0	0	0	0
1202441	2013	Wild Boar Mine	Indiana	Warrick	Active	Surface	Operating Subsidiary	Mine only	Peabody Wild Boar Mining, Llc	566 Dickeyville Rd., Lynnville, IN 47619	...	0	0	0	0	0	1	0	0	0	0
1518914	2013	Highwall Miner #1	Kentucky (East)	Pike	Active	Surface	Contractor	Mine only	Jrc Mining, Inc	P.O. Box 952, Hyden, KY 41749	...	0	0	1	0	0	0	0	0	0	0

	importance	name
0	0.835921	Labor_Hours
1	0.056142	Average_Employees
2	0.006887	Mine_Type_Surface
3	0.005849	Coal_Supply_Region_Powder River Basin
4	0.003486	Coal_Supply_Region_Illinois Basin
5	0.002567	Coal_Supply_Region_Appalachia Central
6	0.002179	Operating_Company_Address_P.O. Box 5, Van, WV ...
7	0.002119	Mine_Status_Active
8	0.001933	Operating_Company_West Fork Coal, Llc
9	0.001843	Company_Type_Independent Producer Operator
10	0.001623	Mine_County_Harlan
11	0.001494	Operating_Company_Address_100 Cranberry Creek ...
12	0.001356	Mine_Status_Active, men working, not producing
13	0.001305	Operating_Company_Cambria Fuel Management Compan
14	0.001208	Mine_State_West Virginia (Southern)
15	0.001202	Operating_Company_Address_P.O. Box 190, Lovely...
16	0.001190	Operating_Company_Mayo Resources Inc
17	0.001139	Mine_State_West Virginia (Southern)
18	0.001094	Mine_County_Martin
19	0.001067	Mine_County_Campbell